In [1]:
import nltk;
import string;
import math;
import csv;
(this notebook requires NLTK)
In [2]:
# read text file
text_path = "data/crime-and-punishment.txt";
with open(text_path) as f:
text_raw = f.read().lower();
# remove punctuation
translate_table = dict((ord(char), None) for char in string.punctuation);
text_raw = text_raw.translate(translate_table);
# tokenize
tokens = nltk.word_tokenize(text_raw);
bigrams = nltk.bigrams(tokens);
# unigram/bigram frequencies
unigram_counts = nltk.FreqDist(tokens);
bigram_counts = nltk.FreqDist(bigrams);
# write to file
unigram_path = text_path + ".unigrams";
bigram_path = text_path + ".bigrams";
with open(unigram_path, "w") as f:
writer = csv.writer(f);
filtered = [ (w,c) for w,c in unigram_counts.items() if c > 1];
writer.writerows(filtered);
with open(bigram_path, "w") as f:
writer = csv.writer(f);
filtered = [ (b[0], b[1],c) for b,c in bigram_counts.items() if c > 3];
writer.writerows(filtered);
Here are the top few most common words:
In [3]:
unigram_counts.most_common(20)
Out[3]:
[('the', 7807),
('and', 6902),
('to', 5266),
('he', 4657),
('a', 4568),
('i', 3939),
('you', 3807),
('of', 3806),
('in', 3188),
('it', 2973),
('that', 2913),
('was', 2820),
('his', 2115),
('at', 2064),
('her', 1823),
('but', 1780),
('not', 1778),
('with', 1706),
('for', 1648),
('she', 1628)]
Below are the most commmon word pairs. These aren't collocations!
In [4]:
bigram_counts.most_common(20)
Out[4]:
[(('in', 'the'), 778),
(('of', 'the'), 598),
(('he', 'was'), 505),
(('he', 'had'), 498),
(('to', 'the'), 488),
(('on', 'the'), 479),
(('i', 'am'), 460),
(('at', 'the'), 459),
(('it', 'was'), 413),
(('that', 'he'), 335),
(('you', 'are'), 326),
(('to', 'be'), 308),
(('in', 'a'), 307),
(('do', 'you'), 292),
(('with', 'a'), 264),
(('did', 'not'), 256),
(('was', 'a'), 249),
(('for', 'the'), 246),
(('at', 'once'), 244),
(('and', 'he'), 241)]
To find collocations, we sort pairs of words by their pointwise mutual information, $$ \mathrm{pmi}(x;y) = \log \frac{p(x,y)}{p(x)p(y)} $$
In [5]:
# compute pmi
pmi_bigrams = [];
for bigram,_ in bigram_counts.most_common(1000):
w1, w2 = bigram;
# compute pmi
actual = bigram_counts[bigram];
expected = unigram_counts[w1] * unigram_counts[w2];
pmi = math.log( actual / expected );
pmi_bigrams.append( (w1, w2, pmi) );
# sort pmi
pmi_sorted = sorted(pmi_bigrams, key=lambda x: x[2], reverse=True);
Here are the top 30 collocations according to PMI:
In [6]:
pmi_sorted[:30]
Out[6]:
[('nikodim', 'fomitch', -3.1780538303479458),
('andrey', 'semyonovitch', -3.1780538303479458),
('dmitri', 'prokofitch', -3.871201010907891),
('sofya', 'semyonovna', -4.330733340286331),
('marfa', 'petrovna', -4.37158498596076),
('rodion', 'romanovitch', -4.574710978503383),
('avdotya', 'romanovna', -4.74493212836325),
('pulcheria', 'alexandrovna', -4.820281565605037),
('great', 'deal', -5.2805000013568755),
('good', 'heavens', -5.509550266836412),
('katerina', 'ivanovnas', -5.569434422125123),
('ilya', 'petrovitch', -5.636573724962751),
('pyotr', 'petrovitch', -5.665343459987014),
('katerina', 'ivanovna', -5.731418449229828),
('amalia', 'ivanovna', -5.87493073085203),
('make', 'haste', -5.961996125397379),
('each', 'other', -6.009777970852694),
('head', 'clerk', -6.170751200206783),
('old', 'woman', -6.17264389932316),
('any', 'case', -6.267612970702526),
('sat', 'down', -6.283876793935164),
('long', 'ago', -6.322955453378991),
('sit', 'down', -6.343232735746065),
('porfiry', 'petrovitch', -6.519492016470095),
('an', 'hour', -6.645639872435834),
('no', 'doubt', -6.7665084847599015),
('young', 'man', -6.980870198698654),
('let', 'us', -7.152727800737759),
('my', 'dear', -7.185930346210463),
('excuse', 'me', -7.317169568047098)]
Just for fun, here are the bottom 30 collocations according to PMI. These are the word pairs that occur together less frequently than expected:
In [7]:
pmi_sorted[-30:]
Out[7]:
[('was', 'that', -12.663334382870111),
('you', 'that', -12.666187451852519),
('and', 'for', -12.691536152637951),
('her', 'he', -12.696268979639559),
('have', 'the', -12.711844297245333),
('had', 'he', -12.768049453586553),
('you', 'to', -12.770571828604202),
('it', 'you', -12.776187676580868),
('as', 'the', -12.792253213656782),
('you', 'and', -12.80749714102439),
('it', 'it', -12.903611192637879),
('but', 'to', -12.917900767413236),
('and', 'it', -12.94507302854542),
('that', 'in', -12.953045343511224),
('you', 'you', -12.962832988148833),
('to', 'a', -12.98852390615963),
('had', 'the', -13.071124656426408),
('that', 'a', -13.071564998120076),
('you', 'in', -13.092869971424403),
('to', 'it', -13.132365958569206),
('to', 'that', -13.178669267029216),
('it', 'a', -13.205281749465744),
('you', 'he', -13.224987596565148),
('and', 'to', -13.331055399808484),
('i', 'to', -13.35120094156068),
('that', 'and', -13.597629435749182),
('you', 'a', -13.62690506999492),
('it', 'the', -13.702006331096907),
('and', 'of', -13.71660472152757),
('you', 'the', -13.773385598017642)]
In [8]:
unigram_path = "data/crime-and-punishment.txt.unigrams";
bigram_path = "data/crime-and-punishment.txt.bigrams";
with open(unigram_path) as f:
reader = csv.reader(f);
unigrams = { row[0] : int(row[1]) for row in csv.reader(f)}
with open(bigram_path) as f:
reader = csv.reader(f);
bigrams = { (row[0],row[1]) : int(row[2]) for row in csv.reader(f)}
In [9]:
bigrams
Out[9]:
{('to', 'hell'): 5,
('room', 'the'): 13,
('people', 'to'): 4,
('the', 'assistant'): 10,
('so', 'soon'): 5,
('get', 'out'): 5,
('and', 'seemed'): 8,
('the', 'murder'): 24,
('gazed', 'at'): 20,
('stairs', 'to'): 5,
('discuss', 'it'): 4,
('me', 'all'): 5,
('an', 'axe'): 12,
('to', 'treat'): 4,
('in', 'very'): 7,
('i', 'look'): 9,
('why', 'who'): 4,
('now', 'not'): 4,
('a', 'time'): 26,
('among', 'them'): 9,
('direction', 'of'): 9,
('and', 'never'): 4,
('last', 'week'): 6,
('the', 'palais'): 6,
('he', 'turned'): 47,
('known', 'it'): 4,
('on', 'her'): 46,
('what', 'made'): 12,
('and', 'out'): 4,
('and', 'for'): 35,
('to', 'zossimov'): 5,
('him', 'but'): 41,
('why', 'he'): 20,
('a', 'cup'): 7,
('wont', 'have'): 5,
('her', 'work'): 4,
('that', 'minute'): 7,
('go', 'on'): 32,
('her', 'voice'): 10,
('possibility', 'of'): 7,
('so', 'isnt'): 5,
('your', 'eyes'): 5,
('asked', 'in'): 10,
('why', 'is'): 14,
('for', 'half'): 8,
('knew', 'it'): 10,
('on', 'that'): 24,
('waked', 'up'): 9,
('three', 'roubles'): 8,
('what', 'right'): 5,
('you', 'again'): 5,
('everyone', 'to'): 4,
('or', 'i'): 7,
('in', 'two'): 10,
('end', 'to'): 8,
('petrovna', 'and'): 5,
('did', 'they'): 4,
('the', 'empty'): 7,
('were', 'looking'): 6,
('in', 'that'): 64,
('with', 'sonia'): 7,
('door', 'i'): 4,
('all', 'his'): 36,
('suspicion', 'and'): 4,
('it', 'no'): 6,
('and', 'where'): 15,
('mother', 'has'): 5,
('you', 'ive'): 6,
('here', 'he'): 22,
('hand', 'but'): 5,
('we', 'could'): 5,
('there', 'may'): 4,
('illness', 'and'): 4,
('of', 'money'): 11,
('aloud', 'and'): 4,
('full', 'possession'): 6,
('some', 'moments'): 4,
('be', 'over'): 4,
('ought', 'not'): 11,
('under', 'his'): 13,
('is', 'open'): 4,
('perhaps', 'have'): 4,
('so', 'a'): 4,
('nerves', 'were'): 5,
('asked', 'for'): 7,
('simply', 'from'): 9,
('have', 'anything'): 4,
('his', 'visit'): 7,
('and', 'turning'): 6,
('a', 'mother'): 5,
('beside', 'himself'): 4,
('are', 'an'): 9,
('its', 'only'): 10,
('know', 'your'): 6,
('at', 'their'): 10,
('smile', 'but'): 6,
('rushing', 'to'): 4,
('certainly', 'did'): 5,
('but', 'we'): 24,
('rodya', 'rodya'): 5,
('yourself', 'that'): 7,
('the', 'hospital'): 14,
('again', 'in'): 13,
('mother', 'who'): 4,
('to', 'follow'): 6,
('me', 'very'): 5,
('expected', 'that'): 5,
('told', 'a'): 6,
('fancy', 'i'): 5,
('the', 'inner'): 4,
('i', 'ought'): 16,
('head', 'of'): 5,
('her', 'a'): 20,
('her', 'mouth'): 8,
('was', 'pale'): 7,
('strange', 'to'): 21,
('of', 'some'): 19,
('a', 'special'): 12,
('in', 'perplexity'): 6,
('observed', 'that'): 4,
('rushed', 'to'): 19,
('the', 'night'): 18,
('yet', 'he'): 11,
('you', 'cried'): 6,
('you', 'may'): 46,
('door', 'of'): 10,
('bring', 'him'): 5,
('wont', 'you'): 8,
('oh', 'how'): 8,
('well', 'have'): 8,
('cinq', 'sous'): 6,
('her', 'how'): 6,
('too', 'little'): 4,
('face', 'in'): 9,
('good', 'manners'): 4,
('in', 'common'): 4,
('different', 'matter'): 7,
('she', 'cried'): 30,
('by', 'some'): 6,
('got', 'on'): 6,
('and', 'read'): 7,
('every', 'day'): 10,
('it', 'he'): 73,
('his', 'linen'): 6,
('is', 'rather'): 4,
('ivanovna', 'you'): 8,
('said', 'nothing'): 12,
('by', 'my'): 6,
('timidly', 'at'): 4,
('on', 'living'): 11,
('with', 'lizaveta'): 4,
('they', 'dont'): 18,
('sit', 'down'): 38,
('went', 'away'): 10,
('send', 'for'): 5,
('so', 'ill'): 4,
('if', 'the'): 11,
('frightened', 'by'): 4,
('have', 'made'): 12,
('know', 'its'): 4,
('said', 'at'): 7,
('and', 'fixed'): 4,
('dont', 'say'): 8,
('it', 'right'): 4,
('honest', 'and'): 7,
('it', 'seems'): 11,
('no', 'reply'): 9,
('you', 'both'): 5,
('been', 'here'): 14,
('said', 'goodbye'): 4,
('it', 'has'): 18,
('to', 'beat'): 5,
('if', 'so'): 4,
('kept', 'his'): 5,
('now', 'that'): 22,
('of', 'your'): 57,
('find', 'that'): 5,
('quickly', 'and'): 8,
('just', 'going'): 4,
('due', 'to'): 5,
('a', 'pause'): 5,
('a', 'free'): 5,
('only', 'one'): 11,
('with', 'horror'): 9,
('me', 'yesterday'): 4,
('crowd', 'of'): 8,
('my', 'mother'): 23,
('and', 'stared'): 8,
('open', 'the'): 12,
('as', 'a'): 87,
('couldnt', 'do'): 4,
('his', 'spine'): 5,
('how', 'the'): 6,
('into', 'a'): 45,
('not', 'merely'): 5,
('he', 'waked'): 4,
('way', 'or'): 6,
('since', 'yesterday'): 5,
('sensation', 'he'): 5,
('this', 'time'): 32,
('me', 'you'): 18,
('answer', 'for'): 6,
('his', 'side'): 4,
('but', 'only'): 10,
('is', 'this'): 6,
('shaking', 'his'): 4,
('so', 'be'): 6,
('blurted', 'out'): 10,
('from', 'behind'): 4,
('you', 'do'): 25,
('we', 'must'): 20,
('with', 'her'): 70,
('take', 'her'): 8,
('honour', 'i'): 4,
('the', 'colour'): 4,
('came', 'out'): 22,
('almost', 'with'): 11,
('drew', 'a'): 5,
('or', 'four'): 9,
('course', 'you'): 7,
('began', 'with'): 6,
('upon', 'him'): 28,
('to', 'raskolnikov'): 29,
('out', 'but'): 6,
('you', 'everything'): 4,
('as', 'it'): 72,
('all', 'who'): 6,
('of', 'service'): 5,
('the', 'policeman'): 18,
('and', 'found'): 5,
('your', 'illness'): 9,
('wrong', 'in'): 5,
('then', 'she'): 22,
('looking', 'round'): 9,
('where', 'she'): 8,
('not', 'delirious'): 4,
('head', 'sink'): 4,
('black', 'bread'): 4,
('still', 'be'): 4,
('succeed', 'in'): 4,
('he', 'listened'): 12,
('dont', 'understand'): 20,
('he', 'really'): 10,
('need', 'not'): 7,
('there', 'you'): 7,
('to', 'do'): 104,
('seemed', 'to'): 100,
('all', 'it'): 6,
('you', 'made'): 7,
('knew', 'of'): 4,
('himself', 'but'): 14,
('to', 'return'): 10,
('dreaming', 'of'): 6,
('brought', 'him'): 8,
('will', 'and'): 6,
('speak', 'and'): 4,
('got', 'into'): 6,
('what', 'we'): 8,
('had', 'already'): 12,
('years', 'and'): 6,
('character', 'and'): 7,
('can', 'always'): 4,
('need', 'of'): 13,
('and', 'secondly'): 7,
('understand', 'how'): 5,
('and', 'fifteen'): 4,
('remember', 'what'): 7,
('lived', 'in'): 6,
('the', 'murdered'): 4,
('was', 'given'): 4,
('the', 'chair'): 10,
('i', 'wanted'): 28,
('at', 'night'): 18,
('the', 'fact'): 23,
('leads', 'to'): 4,
('not', 'answer'): 7,
('asked', 'raskolnikov'): 9,
('always', 'be'): 4,
('corner', 'of'): 8,
('i', 'trust'): 5,
('years', 'in'): 5,
('while', 'he'): 20,
('have', 'told'): 7,
('not', 'tell'): 7,
('to', 'pyotr'): 8,
('i', 'found'): 8,
('then', 'why'): 7,
('long', 'before'): 8,
('morning', 'at'): 4,
('ilya', 'petrovitch'): 32,
('every', 'word'): 10,
('id', 'been'): 6,
('that', 'question'): 9,
('them', 'i'): 17,
('street', 'it'): 4,
('ideas', 'and'): 4,
('dont', 'care'): 12,
('has', 'come'): 9,
('chapter', 'iv'): 6,
('asked', 'her'): 8,
('the', 'sly'): 5,
('remembered', 'that'): 11,
('flashing', 'eyes'): 4,
('are', 'going'): 10,
('another', 'he'): 5,
('got', 'hold'): 4,
('avdotya', 'romanovnas'): 6,
('nonsense', 'he'): 5,
('whether', 'you'): 8,
('are', 'many'): 4,
('of', 'real'): 4,
('it', 'must'): 29,
('said', 'addressing'): 7,
('may', 'not'): 9,
('didnt', 'want'): 7,
('smile', 'as'): 4,
('movement', 'and'): 5,
('there', 'but'): 5,
('he', 'worried'): 4,
('the', 'whip'): 4,
('does', 'she'): 8,
('the', 'purse'): 14,
('to', 'carry'): 9,
('in', 'bed'): 4,
('thought', 'i'): 11,
('arms', 'and'): 4,
('more', 'i'): 8,
('back', 'and'): 14,
('myself', 'up'): 6,
('have', 'thought'): 9,
('thinking', 'and'): 5,
('white', 'as'): 5,
('and', 'contemptuously'): 4,
('almost', 'in'): 9,
('were', 'shaking'): 4,
('rid', 'of'): 13,
('the', 'lower'): 4,
('there', 'have'): 4,
('going', 'in'): 7,
('they', 'know'): 8,
('out', 'again'): 5,
('the', 'article'): 5,
('didnt', 'mean'): 4,
('get', 'round'): 4,
('though', 'he'): 86,
('was', 'my'): 5,
('could', 'never'): 10,
('his', 'plans'): 4,
('herself', 'for'): 5,
('this', 'had'): 4,
('expressed', 'it'): 4,
('our', 'first'): 4,
('dont', 'be'): 37,
('too', 'you'): 5,
('it', 'when'): 10,
('off', 'to'): 11,
('time', 'ago'): 4,
('government', 'quarters'): 4,
('raskolnikov', 'cried'): 9,
('that', 'before'): 4,
('where', 'to'): 4,
('me', 'so'): 11,
('him', 'so'): 17,
('cried', 'raskolnikov'): 12,
('window', 'and'): 14,
('he', 'sat'): 19,
('make', 'up'): 7,
('house', 'was'): 5,
('it', 'does'): 10,
('almost', 'aloud'): 4,
('razumihin', 'shouted'): 6,
('goodbye', 'and'): 4,
('the', 'stone'): 10,
('and', 'without'): 11,
('take', 'you'): 9,
('answered', 'svidrigaïlov'): 4,
('all', 'is'): 11,
('holding', 'out'): 8,
('he', 'hid'): 4,
('have', 'the'): 27,
('all', 'such'): 4,
('last', 'moment'): 5,
('after', 'dinner'): 5,
('the', 'road'): 13,
('addressing', 'razumihin'): 4,
('word', 'of'): 8,
('ive', 'got'): 6,
('will', 'be'): 96,
('what', 'that'): 4,
('here', 'i'): 23,
('a', 'german'): 6,
('where', 'they'): 5,
('here', 'a'): 4,
('and', 'rather'): 10,
('trying', 'to'): 40,
('to', 'your'): 27,
('zametov', 'is'): 5,
('eyes', 'which'): 4,
('want', 'you'): 6,
('why', 'am'): 16,
('short', 'a'): 4,
('to', 'myself'): 13,
('might', 'not'): 9,
('you', 'must'): 55,
('and', 'he'): 241,
('him', 'more'): 7,
('his', 'lodgings'): 4,
('say', 'goodbye'): 5,
('up', 'at'): 16,
('you', 'now'): 13,
('somewhere', 'to'): 5,
('to', 'accept'): 8,
('i', 'only'): 33,
('that', 'sort'): 10,
('to', 'talk'): 20,
('a', 'high'): 5,
('last', 'she'): 4,
('his', 'recent'): 6,
('room', 'like'): 4,
('him', 'at'): 34,
('raskolnikov', 'answered'): 15,
('by', 'and'): 4,
('he', 'did'): 91,
('want', 'me'): 4,
('i', 'right'): 4,
('seen', 'and'): 5,
('no', 'facts'): 5,
('away', 'i'): 7,
('me', 'what'): 15,
('the', 'man'): 66,
('sir', 'he'): 5,
('she', 'wont'): 5,
('man', 'who'): 33,
('mine', 'and'): 4,
('read', 'the'): 4,
('her', 'in'): 30,
('i', 'wasnt'): 4,
('going', 'into'): 5,
('to', 'sing'): 8,
('a', 'step'): 10,
('us', 'a'): 5,
('your', 'opinion'): 4,
('the', 'better'): 13,
('a', 'fearful'): 7,
('hehehe', 'you'): 4,
('an', 'honest'): 5,
('to', 'believe'): 9,
('this', 'i'): 10,
('with', 'one'): 7,
('the', 'heart'): 11,
('his', 'arms'): 10,
('my', 'friend'): 10,
('hotly', 'and'): 5,
('its', 'nonsense'): 5,
('passed', 'between'): 7,
('if', 'they'): 22,
('of', 'tea'): 6,
('drew', 'him'): 5,
('left', 'in'): 8,
('very', 'difficult'): 4,
('suddenly', 'as'): 4,
('me', 'up'): 5,
('idea', 'struck'): 9,
('thinking', 'of'): 19,
('you', 'care'): 6,
('nonsense', 'i'): 5,
('of', 'which'): 22,
('another', 'i'): 4,
('if', 'that'): 6,
('the', 'accident'): 4,
('honourable', 'house'): 6,
('down', 'with'): 10,
('moment', 'later'): 6,
('shall', 'see'): 14,
('whats', 'this'): 5,
('wrapped', 'in'): 5,
('own', 'accord'): 5,
('a', 'tall'): 4,
('with', 'some'): 22,
('ill', 'make'): 6,
('seven', 'years'): 14,
('wrong', 'with'): 4,
('his', 'feet'): 7,
('be', 'ready'): 5,
('must', 'talk'): 4,
('him', 'when'): 7,
('the', 'conviction'): 4,
('raskolnikov', 'made'): 6,
('now', 'to'): 18,
('you', 'if'): 7,
('were', 'they'): 4,
('the', 'paper'): 20,
('day', 'the'): 5,
('in', 'delirium'): 6,
('it', 'said'): 7,
('has', 'only'): 5,
('been', 'too'): 4,
('about', 'her'): 29,
('cut', 'the'): 4,
('used', 'to'): 59,
('muttered', 'the'): 4,
('for', 'us'): 13,
('tell', 'you'): 112,
('had', 'two'): 7,
('youd', 'better'): 13,
('be', 'anxious'): 4,
('afanasy', 'ivanovitch'): 5,
('the', 'crowd'): 43,
('my', 'first'): 7,
('alexandrovna', 'was'): 15,
('man', 'again'): 4,
('all', 'right'): 31,
('set', 'of'): 4,
('open', 'and'): 12,
('haste', 'and'): 7,
('her', 'husband'): 14,
('them', 'the'): 12,
('felt', 'so'): 4,
('goodness', 'knows'): 6,
('not', 'this'): 5,
('so', 'far'): 11,
('consider', 'that'): 4,
('but', 'another'): 4,
('know', 'all'): 15,
('brother', 'he'): 7,
('hour', 'before'): 4,
('the', 'string'): 5,
('shall', 'i'): 29,
('understand', 'it'): 15,
('to', 'become'): 5,
('and', 'make'): 15,
('shoulders', 'and'): 4,
('the', 'gate'): 13,
('voice', 'and'): 4,
('can', 'we'): 4,
('a', 'cardsharper'): 4,
('chair', 'and'): 8,
('his', 'way'): 34,
('all', 'sorts'): 20,
('took', 'out'): 8,
('off', 'i'): 6,
('came', 'upon'): 6,
('of', 'marriage'): 4,
('table', 'he'): 4,
('more', 'honourable'): 5,
('for', 'instance'): 36,
('left', 'the'): 14,
('him', 'who'): 5,
('tears', 'and'): 5,
('was', 'better'): 6,
('was', 'dark'): 5,
('at', 'finding'): 4,
('rodion', 'romanovitch'): 86,
('isnt', 'it'): 17,
('glass', 'of'): 10,
('he', 'always'): 7,
('least', 'he'): 8,
('purpose', 'to'): 12,
('a', 'relation'): 7,
('is', 'my'): 14,
('proof', 'of'): 4,
('her', 'heart'): 13,
('rouble', 'and'): 8,
('is', 'now'): 5,
('a', 'nice'): 10,
('as', 'possible'): 23,
('the', 'position'): 8,
('broke', 'off'): 6,
('be', 'alone'): 6,
('keeping', 'with'): 6,
('to', 'defend'): 4,
('that', 'had'): 23,
('it', 'and'): 126,
('murder', 'and'): 7,
('not', 'what'): 7,
('of', 'taking'): 4,
('said', 'suddenly'): 13,
('a', 'cat'): 4,
('great', 'thing'): 6,
('mother', 'said'): 7,
('he', 'saw'): 41,
('intelligent', 'man'): 4,
('his', 'pale'): 4,
('right', 'side'): 4,
('am', 'the'): 7,
('both', 'sides'): 5,
('washing', 'the'): 4,
('come', 'rodya'): 4,
('state', 'of'): 13,
('idea', 'of'): 16,
('illness', 'he'): 4,
('think', 'you'): 12,
('a', 'crime'): 9,
('please', 'do'): 4,
('gentleman', 'was'): 5,
('better', 'and'): 4,
('whether', 'i'): 11,
('besides', 'he'): 4,
('to', 'explain'): 15,
('first', 'place'): 15,
('next', 'day'): 19,
('a', 'place'): 4,
('way', 'but'): 4,
('white', 'and'): 7,
('the', 'passersby'): 6,
('dirty', 'and'): 6,
('her', 'to'): 44,
('set', 'off'): 8,
('moment', 'a'): 4,
('in', 'their'): 22,
('which', 'had'): 17,
('is', 'true'): 7,
('i', 'agree'): 5,
('and', 'contempt'): 4,
('fever', 'he'): 4,
('get', 'her'): 6,
('be', 'the'): 29,
('in', 'every'): 11,
('a', 'fresh'): 4,
('not', 'more'): 9,
('though', 'in'): 11,
('walk', 'about'): 6,
('well', 'brother'): 5,
('they', 'heard'): 4,
('had', 'seen'): 24,
('ago', 'i'): 9,
('and', 'still'): 10,
('herself', 'to'): 6,
('their', 'eyes'): 8,
('so', 'i'): 33,
('your', 'sister'): 25,
('at', 'its'): 4,
('axe', 'and'): 9,
('showed', 'him'): 4,
('do', 'she'): 4,
('you', 'go'): 26,
('i', 'dont'): 163,
('would', 'he'): 6,
('i', 'to'): 33,
('a', 'cry'): 6,
('interrupted', 'with'): 7,
('besides', 'the'): 4,
('an', 'hour'): 45,
('that', 'as'): 11,
('its', 'being'): 5,
('more', 'to'): 8,
('was', 'completely'): 4,
('mother', 'he'): 7,
('up', 'that'): 5,
('not', 'made'): 4,
('his', 'elbow'): 5,
('you', 'only'): 8,
('opened', 'the'): 23,
('men', 'and'): 9,
('porfiry', 'but'): 4,
('indignation', 'at'): 4,
('svidrigaïlov', 'had'): 8,
('was', 'doing'): 12,
('they', 'went'): 6,
('his', 'illness'): 11,
('just', 'come'): 13,
('it', 'because'): 8,
('house', 'he'): 4,
('looking', 'down'): 6,
('table', 'in'): 7,
('very', 'young'): 4,
('plans', 'and'): 5,
('round', 'the'): 11,
('them', 'they'): 11,
('to', 'utter'): 5,
('fury', 'and'): 4,
('a', 'future'): 4,
('have', 'liked'): 5,
('down', 'and'): 18,
('alone', 'and'): 8,
('see', 'him'): 24,
('minutes', 'and'): 6,
('but', 'is'): 5,
('in', 'you'): 7,
('and', 'night'): 5,
('my', 'illness'): 4,
('his', 'senses'): 6,
('from', 'one'): 10,
('a', 'shock'): 4,
('your', 'brother'): 11,
('that', 'his'): 35,
('understand', 'the'): 5,
('in', 'keeping'): 5,
('have', 'only'): 14,
('you', 'in'): 25,
('elbow', 'on'): 5,
('into', 'his'): 40,
('and', 'also'): 7,
('i', 'was'): 193,
('the', 'form'): 4,
('get', 'married'): 6,
('had', 'an'): 4,
('them', 'sonia'): 4,
('and', 'put'): 17,
('of', 'all'): 53,
('ive', 'had'): 6,
('and', 'apparently'): 4,
('answer', 'and'): 4,
('see', 'clearly'): 4,
('hands', 'as'): 4,
('a', 'genuine'): 4,
('a', 'stone'): 10,
('give', 'it'): 13,
('they', 'came'): 9,
('met', 'him'): 8,
('very', 'likely'): 11,
('assistant', 'superintendent'): 8,
('speak', 'but'): 7,
('he', 'showed'): 5,
('in', 'i'): 14,
('right', 'he'): 4,
('eyes', 'met'): 4,
('but', 'youre'): 4,
('my', 'word'): 20,
('the', 'ceiling'): 4,
('ran', 'down'): 8,
('in', 'too'): 7,
('me', 'im'): 4,
('rest', 'on'): 4,
('that', 'was'): 67,
('is', 'there'): 23,
('it', 'in'): 40,
('and', 'be'): 6,
('off', 'at'): 5,
('the', 'name'): 5,
('go', 'home'): 6,
('the', 'railing'): 5,
('who', 'killed'): 5,
('would', 'do'): 5,
('happiness', 'of'): 7,
('one', 'moment'): 7,
('now', 'he'): 42,
('find', 'her'): 4,
('and', 'had'): 55,
('away', 'the'): 15,
('story', 'of'): 8,
('she', 'took'): 10,
('him', 'this'): 10,
('me', 'this'): 8,
('can', 'she'): 5,
('round', 'him'): 9,
('will', 'begin'): 4,
('with', 'three'): 5,
('asked', 'himself'): 6,
('would', 'be'): 106,
('havent', 'i'): 4,
('you', 'of'): 7,
('much', 'so'): 9,
('the', 'arm'): 4,
('such', 'cases'): 4,
('though', 'katerina'): 5,
('raskolnikov', 'stopped'): 5,
('cant', 'be'): 18,
('an', 'absurd'): 4,
('like', 'the'): 13,
('him', 'let'): 4,
('better', 'for'): 11,
('ask', 'forgiveness'): 4,
('especially', 'in'): 5,
('the', 'islands'): 4,
('no', 'thats'): 8,
('had', 'known'): 8,
('such', 'things'): 4,
('case', 'with'): 4,
('it', 'is'): 161,
('before', 'i'): 5,
('to', 'understand'): 16,
('heard', 'all'): 4,
('was', 'all'): 27,
('sign', 'of'): 6,
('was', 'full'): 4,
('for', 'having'): 7,
('since', 'then'): 4,
('sort', 'i'): 4,
('very', 'much'): 33,
('could', 'only'): 14,
('with', 'nervous'): 4,
('far', 'away'): 4,
('sonia', 'stood'): 4,
('bed', 'with'): 4,
('fear', 'of'): 6,
('thought', 'to'): 9,
('anxiety', 'and'): 7,
('till', 'we'): 7,
('money', 'in'): 5,
('stopped', 'short'): 13,
('although', 'i'): 5,
('your', 'mother'): 16,
('one', 'and'): 11,
('breath', 'of'): 4,
('quite', 'in'): 4,
('shall', 'never'): 6,
('now', 'i'): 53,
('i', 'simply'): 14,
('today', 'that'): 5,
('every', 'step'): 5,
('said', 'in'): 10,
('to', 'get'): 76,
('she', 'gave'): 13,
('turned', 'round'): 8,
('present', 'from'): 5,
('nothing', 'in'): 9,
('me', 'but'): 36,
('is', 'a'): 137,
('a', 'level'): 6,
('for', 'everything'): 6,
('do', 'they'): 6,
('it', 'out'): 25,
('o', 'u'): 16,
('he', 'gazed'): 11,
('he', 'ought'): 10,
('hold', 'out'): 4,
('two', 'hours'): 4,
('social', 'position'): 4,
('he', 'whispered'): 8,
('another', 'and'): 7,
('me', 'of'): 8,
('since', 'i'): 9,
('sank', 'into'): 19,
('a', 'pity'): 8,
('yesterday', 'at'): 4,
('thats', 'it'): 14,
('in', 'moscow'): 6,
('but', 'of'): 6,
('up', 'she'): 4,
('and', 'can'): 7,
('felt', 'a'): 18,
('lay', 'the'): 4,
('or', 'whether'): 4,
('out', 'to'): 22,
('possible', 'to'): 9,
('talked', 'of'): 6,
('shouted', 'razumihin'): 5,
('eyes', 'in'): 6,
('it', 'really'): 5,
('illness', 'was'): 4,
('but', 'after'): 4,
('an', 'important'): 7,
('ashamed', 'and'): 4,
('no', 'interest'): 4,
('looked', 'in'): 8,
('could', 'be'): 41,
('there', 'and'): 17,
('paid', 'for'): 7,
('could', 'you'): 17,
('each', 'other'): 28,
('i', 'knew'): 27,
('yes', 'theres'): 5,
('a', 'dozen'): 5,
('legal', 'marriage'): 4,
('doing', 'it'): 5,
('to', 'know'): 44,
('a', 'doctor'): 12,
('with', 'blood'): 13,
('they', 'sat'): 4,
('talking', 'in'): 6,
('thought', 'as'): 4,
('that', 'he'): 335,
('the', 'prison'): 6,
('uneasily', 'at'): 4,
('perhaps', 'the'): 4,
('oh', 'thats'): 6,
('more', 'of'): 4,
('got', 'a'): 10,
('look', 'here'): 5,
('there', 'had'): 8,
('them', 'so'): 9,
('or', 'less'): 5,
('does', 'it'): 20,
('not', 'ill'): 4,
('last', 'to'): 4,
('had', 'just'): 34,
('youre', 'right'): 6,
('i', 'would'): 38,
('i', 'used'): 10,
('were', 'standing'): 4,
('name', 'of'): 6,
('pay', 'him'): 4,
('cried', 'dounia'): 12,
('he', 'wont'): 7,
('kissing', 'her'): 4,
('answer', 'he'): 6,
('and', 'both'): 8,
('been', 'taken'): 7,
('could', 'hear'): 10,
('covered', 'with'): 21,
('my', 'god'): 9,
('have', 'something'): 6,
('he', 'made'): 29,
('his', 'shoulder'): 5,
('not', 'remember'): 5,
('it', 'thats'): 15,
('cry', 'of'): 4,
('heard', 'it'): 12,
('crossed', 'the'): 4,
('yes', 'that'): 4,
('that', 'the'): 109,
('you', 'who'): 9,
('before', 'in'): 6,
('what', 'what'): 7,
('mr', 'lebeziatnikov'): 16,
('no', 'business'): 4,
('almost', 'the'): 5,
('day', 'at'): 5,
('yes', 'indeed'): 4,
('what', 'sort'): 8,
('if', 'id'): 5,
('both', 'his'): 4,
('to', 'give'): 40,
('me', 'for'): 32,
('was', 'it'): 42,
('way', 'in'): 8,
('he', 'shouted'): 27,
('doubt', 'he'): 4,
('a', 'change'): 4,
('youve', 'heard'): 4,
('porfiry', 'and'): 6,
('are', 'two'): 4,
('door', 'opened'): 9,
('you', 'suppose'): 11,
('sent', 'a'): 6,
('amalia', 'ludwigovna'): 8,
('the', 'cemetery'): 6,
('get', 'away'): 10,
('sister', 'i'): 5,
('a', 'general'): 6,
('he', 'let'): 4,
('ive', 'been'): 43,
('seat', 'and'): 6,
('happened', 'i'): 4,
('at', 'our'): 4,
('tell', 'him'): 4,
('hour', 'later'): 8,
('svidrigaïlov', 'with'): 5,
('i', 'told'): 31,
('im', 'a'): 4,
('he', 'smiled'): 5,
('murdered', 'her'): 4,
('see', 'that'): 26,
('to', 'face'): 9,
('month', 'ago'): 7,
('with', 'hatred'): 4,
('besides', 'i'): 9,
('something', 'from'): 6,
('why', 'it'): 8,
('the', 'little'): 29,
('subject', 'and'): 4,
('is', 'something'): 5,
('semyon', 'zaharovitch'): 10,
('he', 'stopped'): 14,
('himself', 'with'): 10,
('silence', 'followed'): 4,
('you', 'gave'): 7,
('so', 'that'): 94,
('his', 'seat'): 6,
('on', 'one'): 16,
('so', 'the'): 9,
('up', 'everything'): 7,
('the', 'bridge'): 19,
('times', 'when'): 4,
('coming', 'here'): 7,
('keep', 'him'): 5,
('you', 'porfiry'): 5,
('the', 'sounds'): 4,
('second', 'storey'): 4,
('her', 'daughter'): 12,
('left', 'alone'): 8,
('heart', 'as'): 4,
('listening', 'to'): 7,
('for', 'herself'): 5,
('sometimes', 'he'): 5,
('found', 'a'): 9,
('door', 'was'): 18,
('i', 'assure'): 26,
('bound', 'to'): 11,
('beginning', 'to'): 13,
('followed', 'her'): 5,
('sensible', 'woman'): 4,
('think', 'i'): 12,
('the', 'canal'): 21,
('are', 'at'): 4,
('only', 'wanted'): 6,
('he', 'didnt'): 6,
('he', 'almost'): 8,
('believe', 'me'): 21,
('one', 'cant'): 7,
('call', 'me'): 4,
('this', 'way'): 12,
('of', 'yours'): 15,
('sent', 'me'): 7,
('of', 'fever'): 4,
('we', 'to'): 7,
('was', 'even'): 9,
('how', 'a'): 5,
('a', 'thousand'): 14,
...}
Content source: eecs445-f16/umich-eecs445-f16
Similar notebooks: